{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Feature: \"Jaccard with WHQ\" (@dasolmar)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Based on the kernel [XGB with whq jaccard](https://www.kaggle.com/dasolmar/xgb-with-whq-jaccard/) by David Solis."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This utility package imports `numpy`, `pandas`, `matplotlib` and a helper `kg` module into the root namespace."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pygoose import *"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "NLTK tools"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import nltk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import Counter\n",
    "from nltk.corpus import stopwords"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package stopwords to\n",
      "[nltk_data]     /Users/xshuai/nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nltk.download('stopwords')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Config"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Automatically discover the paths to various data folders and compose the project structure."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "project = kg.Project.discover()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Identifier for storing these features on disk and referring to them later."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "feature_list_id = '3rdparty_dasolmar_whq'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Original question sets."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train = pd.read_csv(project.data_dir + 'train.csv').fillna('')\n",
    "df_test = pd.read_csv(project.data_dir + 'test.csv').fillna('')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "NLTK built-in stopwords."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "stops = set(stopwords.words(\"english\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# If a word appears only once, we ignore it completely (likely a typo)\n",
    "# Epsilon defines a smoothing constant, which makes the effect of extremely rare words smaller\n",
    "def get_weight(count, eps=10000, min_count=2):\n",
    "    return 0 if count < min_count else 1 / (count + eps)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def add_word_count(x, df, word):\n",
    "    x['das_q1_' + word] = df['question1'].apply(lambda x: (word in str(x).lower())*1)\n",
    "    x['das_q2_' + word] = df['question2'].apply(lambda x: (word in str(x).lower())*1)\n",
    "    x['das_' + word + '_both'] = x['das_q1_' + word] * x['das_q2_' + word]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_qs = pd.Series(df_train['question1'].tolist() + df_train['question2'].tolist()).astype(str)\n",
    "words = (\" \".join(train_qs)).lower().split()\n",
    "counts = Counter(words)\n",
    "weights = {word: get_weight(count) for word, count in counts.items()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def word_shares(row):\n",
    "    q1_list = str(row['question1']).lower().split()\n",
    "    q1 = set(q1_list)\n",
    "    q1words = q1.difference(stops)\n",
    "    if len(q1words) == 0:\n",
    "        return '0:0:0:0:0:0:0:0'\n",
    "\n",
    "    q2_list = str(row['question2']).lower().split()\n",
    "    q2 = set(q2_list)\n",
    "    q2words = q2.difference(stops)\n",
    "    if len(q2words) == 0:\n",
    "        return '0:0:0:0:0:0:0:0'\n",
    "\n",
    "    words_hamming = sum(1 for i in zip(q1_list, q2_list) if i[0]==i[1])/max(len(q1_list), len(q2_list))\n",
    "\n",
    "    q1stops = q1.intersection(stops)\n",
    "    q2stops = q2.intersection(stops)\n",
    "\n",
    "    q1_2gram = set([i for i in zip(q1_list, q1_list[1:])])\n",
    "    q2_2gram = set([i for i in zip(q2_list, q2_list[1:])])\n",
    "\n",
    "    shared_2gram = q1_2gram.intersection(q2_2gram)\n",
    "\n",
    "    shared_words = q1words.intersection(q2words)\n",
    "    shared_weights = [weights.get(w, 0) for w in shared_words]\n",
    "    q1_weights = [weights.get(w, 0) for w in q1words]\n",
    "    q2_weights = [weights.get(w, 0) for w in q2words]\n",
    "    total_weights = q1_weights + q1_weights\n",
    "\n",
    "    R1 = np.sum(shared_weights) / np.sum(total_weights) #tfidf share\n",
    "    R2 = len(shared_words) / (len(q1words) + len(q2words) - len(shared_words)) #count share\n",
    "    R31 = len(q1stops) / len(q1words) #stops in q1\n",
    "    R32 = len(q2stops) / len(q2words) #stops in q2\n",
    "    Rcosine_denominator = (np.sqrt(np.dot(q1_weights,q1_weights))*np.sqrt(np.dot(q2_weights,q2_weights)))\n",
    "    Rcosine = np.dot(shared_weights, shared_weights)/Rcosine_denominator\n",
    "    if len(q1_2gram) + len(q2_2gram) == 0:\n",
    "        R2gram = 0\n",
    "    else:\n",
    "        R2gram = len(shared_2gram) / (len(q1_2gram) + len(q2_2gram))\n",
    "    return '{}:{}:{}:{}:{}:{}:{}:{}'.format(R1, R2, len(shared_words), R31, R32, R2gram, Rcosine, words_hamming)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/Cellar/python3/3.6.3/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
      "of pandas will change to not sort by default.\n",
      "\n",
      "To accept the future behavior, pass 'sort=False'.\n",
      "\n",
      "To retain the current behavior and silence the warning, pass 'sort=True'.\n",
      "\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n",
      "/usr/local/Cellar/python3/3.6.3/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/ipykernel_launcher.py:30: RuntimeWarning: invalid value encountered in double_scalars\n",
      "/usr/local/Cellar/python3/3.6.3/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/ipykernel_launcher.py:35: RuntimeWarning: invalid value encountered in true_divide\n",
      "/usr/local/Cellar/python3/3.6.3/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/ipykernel_launcher.py:30: RuntimeWarning: invalid value encountered in long_scalars\n"
     ]
    }
   ],
   "source": [
    "df = pd.concat([df_train, df_test])\n",
    "df['word_shares'] = df.apply(word_shares, axis=1, raw=True)\n",
    "\n",
    "x = pd.DataFrame()\n",
    "\n",
    "x['das_word_match']       = df['word_shares'].apply(lambda x: float(x.split(':')[0]))\n",
    "x['das_word_match_2root'] = np.sqrt(x['das_word_match'])\n",
    "x['das_tfidf_word_match'] = df['word_shares'].apply(lambda x: float(x.split(':')[1]))\n",
    "x['das_shared_count']     = df['word_shares'].apply(lambda x: float(x.split(':')[2]))\n",
    "\n",
    "x['das_stops1_ratio']     = df['word_shares'].apply(lambda x: float(x.split(':')[3]))\n",
    "x['das_stops2_ratio']     = df['word_shares'].apply(lambda x: float(x.split(':')[4]))\n",
    "x['das_shared_2gram']     = df['word_shares'].apply(lambda x: float(x.split(':')[5]))\n",
    "x['das_cosine']           = df['word_shares'].apply(lambda x: float(x.split(':')[6]))\n",
    "x['das_words_hamming']    = df['word_shares'].apply(lambda x: float(x.split(':')[7]))\n",
    "x['das_diff_stops_r']     = np.abs(x['das_stops1_ratio'] - x['das_stops2_ratio'])\n",
    "\n",
    "x['das_len_q1'] = df['question1'].apply(lambda x: len(str(x)))\n",
    "x['das_len_q2'] = df['question2'].apply(lambda x: len(str(x)))\n",
    "x['das_diff_len'] = np.abs(x['das_len_q1'] - x['das_len_q2'])\n",
    "\n",
    "x['das_caps_count_q1'] = df['question1'].apply(lambda x:sum(1 for i in str(x) if i.isupper()))\n",
    "x['das_caps_count_q2'] = df['question2'].apply(lambda x:sum(1 for i in str(x) if i.isupper()))\n",
    "x['das_diff_caps'] = np.abs(x['das_caps_count_q1'] - x['das_caps_count_q2'])\n",
    "\n",
    "x['das_len_char_q1'] = df['question1'].apply(lambda x: len(str(x).replace(' ', '')))\n",
    "x['das_len_char_q2'] = df['question2'].apply(lambda x: len(str(x).replace(' ', '')))\n",
    "x['das_diff_len_char'] = np.abs(x['das_len_char_q1'] - x['das_len_char_q2'])\n",
    "\n",
    "x['das_len_word_q1'] = df['question1'].apply(lambda x: len(str(x).split()))\n",
    "x['das_len_word_q2'] = df['question2'].apply(lambda x: len(str(x).split()))\n",
    "x['das_diff_len_word'] = np.abs(x['das_len_word_q1'] - x['das_len_word_q2'])\n",
    "\n",
    "x['das_avg_word_len1'] = x['das_len_char_q1'] / x['das_len_word_q1']\n",
    "x['das_avg_word_len2'] = x['das_len_char_q2'] / x['das_len_word_q2']\n",
    "x['das_diff_avg_word'] = np.abs(x['das_avg_word_len1'] - x['das_avg_word_len2'])\n",
    "\n",
    "# x['exactly_same'] = (df['question1'] == df['question2']).astype(int)\n",
    "# x['duplicated'] = df.duplicated(['question1','question2']).astype(int)\n",
    "\n",
    "whq_words = ['how', 'what', 'which', 'who', 'where', 'when', 'why']\n",
    "for whq in whq_words:\n",
    "    add_word_count(x, df, whq)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "whq_columns_q1 = ['das_q1_' + whq for whq in whq_words]\n",
    "whq_columns_q2 = ['das_q2_' + whq for whq in whq_words]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "x['whq_count_q1'] = x[whq_columns_q1].sum(axis=1)\n",
    "x['whq_count_q2'] = x[whq_columns_q2].sum(axis=1)\n",
    "x['whq_count_diff'] = np.abs(x['whq_count_q1'] - x['whq_count_q2'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Features: ['das_word_match', 'das_word_match_2root', 'das_tfidf_word_match', 'das_shared_count', 'das_stops1_ratio', 'das_stops2_ratio', 'das_shared_2gram', 'das_cosine', 'das_words_hamming', 'das_diff_stops_r', 'das_len_q1', 'das_len_q2', 'das_diff_len', 'das_caps_count_q1', 'das_caps_count_q2', 'das_diff_caps', 'das_len_char_q1', 'das_len_char_q2', 'das_diff_len_char', 'das_len_word_q1', 'das_len_word_q2', 'das_diff_len_word', 'das_avg_word_len1', 'das_avg_word_len2', 'das_diff_avg_word', 'das_q1_how', 'das_q2_how', 'das_how_both', 'das_q1_what', 'das_q2_what', 'das_what_both', 'das_q1_which', 'das_q2_which', 'das_which_both', 'das_q1_who', 'das_q2_who', 'das_who_both', 'das_q1_where', 'das_q2_where', 'das_where_both', 'das_q1_when', 'das_q2_when', 'das_when_both', 'das_q1_why', 'das_q2_why', 'das_why_both', 'whq_count_q1', 'whq_count_q2', 'whq_count_diff']\n"
     ]
    }
   ],
   "source": [
    "feature_names = list(x.columns.values)\n",
    "print(\"Features: {}\".format(feature_names))\n",
    "\n",
    "X_train = x[:df_train.shape[0]].values\n",
    "X_test  = x[df_train.shape[0]:].values"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "project.save_features(X_train, X_test, feature_names, feature_list_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
